import pandas as pd
import pandas_profiling as pp
import numpy as np
import matplotlib.pyplot as plt
df = pd.read_csv(r'sales_data_sample.csv',sep=",", encoding='Latin-1')
df.head()
| ORDERNUMBER | QUANTITYORDERED | PRICEEACH | ORDERLINENUMBER | SALES | ORDERDATE | STATUS | QTR_ID | MONTH_ID | YEAR_ID | ... | ADDRESSLINE1 | ADDRESSLINE2 | CITY | STATE | POSTALCODE | COUNTRY | TERRITORY | CONTACTLASTNAME | CONTACTFIRSTNAME | DEALSIZE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10107 | 30 | 95.70 | 2 | 2871.00 | 2/24/2003 0:00 | Shipped | 1 | 2 | 2003 | ... | 897 Long Airport Avenue | NaN | NYC | NY | 10022 | USA | NaN | Yu | Kwai | Small |
| 1 | 10121 | 34 | 81.35 | 5 | 2765.90 | 5/7/2003 0:00 | Shipped | 2 | 5 | 2003 | ... | 59 rue de l'Abbaye | NaN | Reims | NaN | 51100 | France | EMEA | Henriot | Paul | Small |
| 2 | 10134 | 41 | 94.74 | 2 | 3884.34 | 7/1/2003 0:00 | Shipped | 3 | 7 | 2003 | ... | 27 rue du Colonel Pierre Avia | NaN | Paris | NaN | 75508 | France | EMEA | Da Cunha | Daniel | Medium |
| 3 | 10145 | 45 | 83.26 | 6 | 3746.70 | 8/25/2003 0:00 | Shipped | 3 | 8 | 2003 | ... | 78934 Hillside Dr. | NaN | Pasadena | CA | 90003 | USA | NaN | Young | Julie | Medium |
| 4 | 10159 | 49 | 100.00 | 14 | 5205.27 | 10/10/2003 0:00 | Shipped | 4 | 10 | 2003 | ... | 7734 Strong St. | NaN | San Francisco | CA | NaN | USA | NaN | Brown | Julie | Medium |
5 rows × 25 columns
pp.ProfileReport(df)
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2823 entries, 0 to 2822 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ORDERNUMBER 2823 non-null int64 1 QUANTITYORDERED 2823 non-null int64 2 PRICEEACH 2823 non-null float64 3 ORDERLINENUMBER 2823 non-null int64 4 SALES 2823 non-null float64 5 ORDERDATE 2823 non-null object 6 STATUS 2823 non-null object 7 QTR_ID 2823 non-null int64 8 MONTH_ID 2823 non-null int64 9 YEAR_ID 2823 non-null int64 10 PRODUCTLINE 2823 non-null object 11 MSRP 2823 non-null int64 12 PRODUCTCODE 2823 non-null object 13 CUSTOMERNAME 2823 non-null object 14 PHONE 2823 non-null object 15 ADDRESSLINE1 2823 non-null object 16 ADDRESSLINE2 302 non-null object 17 CITY 2823 non-null object 18 STATE 1337 non-null object 19 POSTALCODE 2747 non-null object 20 COUNTRY 2823 non-null object 21 TERRITORY 1749 non-null object 22 CONTACTLASTNAME 2823 non-null object 23 CONTACTFIRSTNAME 2823 non-null object 24 DEALSIZE 2823 non-null object dtypes: float64(2), int64(7), object(16) memory usage: 551.5+ KB
df.rename(columns = {'QTR_ID':'QTR'}, inplace = True)
df.rename(columns = {'MONTH_ID':'MONTH'}, inplace = True)
df.rename(columns = {'YEAR_ID':'YEAR'}, inplace = True)
df.isnull().sum()
ORDERNUMBER 0 QUANTITYORDERED 0 PRICEEACH 0 ORDERLINENUMBER 0 SALES 0 ORDERDATE 0 STATUS 0 QTR 0 MONTH 0 YEAR 0 PRODUCTLINE 0 MSRP 0 PRODUCTCODE 0 CUSTOMERNAME 0 PHONE 0 ADDRESSLINE1 0 ADDRESSLINE2 2521 CITY 0 STATE 1486 POSTALCODE 76 COUNTRY 0 TERRITORY 1074 CONTACTLASTNAME 0 CONTACTFIRSTNAME 0 DEALSIZE 0 dtype: int64
df[['ADDRESSLINE2']] = df[['ADDRESSLINE2']].replace(to_replace=np.nan, value='Not Available')
df[['STATE']] = df[['STATE']].replace(to_replace=np.nan, value='Not Available')
df[['TERRITORY']] = df[['TERRITORY']].replace(to_replace=np.nan, value='Not Available')
df[['POSTALCODE']] = df[['POSTALCODE']].replace(to_replace=np.nan, value=0)
df.head()
| ORDERNUMBER | QUANTITYORDERED | PRICEEACH | ORDERLINENUMBER | SALES | ORDERDATE | STATUS | QTR | MONTH | YEAR | ... | ADDRESSLINE1 | ADDRESSLINE2 | CITY | STATE | POSTALCODE | COUNTRY | TERRITORY | CONTACTLASTNAME | CONTACTFIRSTNAME | DEALSIZE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10107 | 30 | 95.70 | 2 | 2871.00 | 2/24/2003 0:00 | Shipped | 1 | 2 | 2003 | ... | 897 Long Airport Avenue | Not Available | NYC | NY | 10022 | USA | Not Available | Yu | Kwai | Small |
| 1 | 10121 | 34 | 81.35 | 5 | 2765.90 | 5/7/2003 0:00 | Shipped | 2 | 5 | 2003 | ... | 59 rue de l'Abbaye | Not Available | Reims | Not Available | 51100 | France | EMEA | Henriot | Paul | Small |
| 2 | 10134 | 41 | 94.74 | 2 | 3884.34 | 7/1/2003 0:00 | Shipped | 3 | 7 | 2003 | ... | 27 rue du Colonel Pierre Avia | Not Available | Paris | Not Available | 75508 | France | EMEA | Da Cunha | Daniel | Medium |
| 3 | 10145 | 45 | 83.26 | 6 | 3746.70 | 8/25/2003 0:00 | Shipped | 3 | 8 | 2003 | ... | 78934 Hillside Dr. | Not Available | Pasadena | CA | 90003 | USA | Not Available | Young | Julie | Medium |
| 4 | 10159 | 49 | 100.00 | 14 | 5205.27 | 10/10/2003 0:00 | Shipped | 4 | 10 | 2003 | ... | 7734 Strong St. | Not Available | San Francisco | CA | 0 | USA | Not Available | Brown | Julie | Medium |
5 rows × 25 columns
df.isna().any()
ORDERNUMBER False QUANTITYORDERED False PRICEEACH False ORDERLINENUMBER False SALES False ORDERDATE False STATUS False QTR False MONTH False YEAR False PRODUCTLINE False MSRP False PRODUCTCODE False CUSTOMERNAME False PHONE False ADDRESSLINE1 False ADDRESSLINE2 False CITY False STATE False POSTALCODE False COUNTRY False TERRITORY False CONTACTLASTNAME False CONTACTFIRSTNAME False DEALSIZE False dtype: bool
print(df['PRODUCTLINE'].unique())
['Motorcycles' 'Classic Cars' 'Trucks and Buses' 'Vintage Cars' 'Planes' 'Ships' 'Trains']
soldForEachProduct = df.query("STATUS=='Shipped'")
soldForEachProduct = soldForEachProduct.filter(['PRODUCTLINE','SALES'])
soldForEachProduct = soldForEachProduct.groupby('PRODUCTLINE').sum()
soldForEachProduct
| SALES | |
|---|---|
| PRODUCTLINE | |
| Classic Cars | 3701760.33 |
| Motorcycles | 1129573.83 |
| Planes | 866466.57 |
| Ships | 591172.76 |
| Trains | 215352.57 |
| Trucks and Buses | 1044097.39 |
| Vintage Cars | 1743077.63 |
msrpForEachProduct = df.query("STATUS=='Shipped'")
msrpForEachProduct = msrpForEachProduct.filter(['PRODUCTLINE','MSRP','QUANTITYORDERED'])
msrpForEachProductTotal = msrpForEachProduct['MSRP'] * msrpForEachProduct ['QUANTITYORDERED']
msrpForEachProduct['MSRP'] = msrpForEachProductTotal
msrpForEachProduct = msrpForEachProduct.filter(['PRODUCTLINE','MSRP']).groupby('PRODUCTLINE').sum()
msrpForEachProduct
| MSRP | |
|---|---|
| PRODUCTLINE | |
| Classic Cars | 3861333 |
| Motorcycles | 1102699 |
| Planes | 835617 |
| Ships | 586467 |
| Trains | 188044 |
| Trucks and Buses | 1026913 |
| Vintage Cars | 1654406 |
revenueForEachProduct = soldForEachProduct['SALES'] - msrpForEachProduct['MSRP']
revenueForEachProduct = pd.DataFrame({
"REVENUE":pd.Series(revenueForEachProduct)
})
revenueForEachProduct
| REVENUE | |
|---|---|
| PRODUCTLINE | |
| Classic Cars | -159572.67 |
| Motorcycles | 26874.83 |
| Planes | 30849.57 |
| Ships | 4705.76 |
| Trains | 27308.57 |
| Trucks and Buses | 17184.39 |
| Vintage Cars | 88671.63 |
totalPricesForEachProduct = df
totalPricesForEachProduct = totalPricesForEachProduct.filter(['PRODUCTLINE','MSRP','QUANTITYORDERED'])
priceTotalForEachProduct = totalPricesForEachProduct['MSRP'] * totalPricesForEachProduct['QUANTITYORDERED']
totalPricesForEachProduct['TOTALPRICE'] = priceTotalForEachProduct
totalPricesForEachProduct = totalPricesForEachProduct.filter(['PRODUCTLINE','TOTALPRICE']).groupby('PRODUCTLINE').sum()
totalPricesForEachProduct
| TOTALPRICE | |
|---|---|
| PRODUCTLINE | |
| Classic Cars | 4084217 |
| Motorcycles | 1139168 |
| Planes | 952075 |
| Ships | 699304 |
| Trains | 197044 |
| Trucks and Buses | 1110388 |
| Vintage Cars | 1815146 |
totalUnitsSoldForEachProduct = df
totalUnitsSoldForEachProduct = totalUnitsSoldForEachProduct.filter(['PRODUCTLINE','QUANTITYORDERED'])
totalUnitsSoldForEachProduct = totalUnitsSoldForEachProduct.groupby('PRODUCTLINE').sum()
totalUnitsSoldForEachProduct
| QUANTITYORDERED | |
|---|---|
| PRODUCTLINE | |
| Classic Cars | 33992 |
| Motorcycles | 11663 |
| Planes | 10727 |
| Ships | 8127 |
| Trains | 2712 |
| Trucks and Buses | 10777 |
| Vintage Cars | 21069 |
averageForEachProduct = totalPricesForEachProduct['TOTALPRICE'] / totalUnitsSoldForEachProduct['QUANTITYORDERED']
averageForEachProduct = pd.DataFrame({
"AVERAGE":pd.Series(averageForEachProduct)
})
averageForEachProduct
| AVERAGE | |
|---|---|
| PRODUCTLINE | |
| Classic Cars | 120.152301 |
| Motorcycles | 97.673669 |
| Planes | 88.755011 |
| Ships | 86.047004 |
| Trains | 72.656342 |
| Trucks and Buses | 103.033126 |
| Vintage Cars | 86.152451 |
totalNumberOfOrdersForEachProduct = df
totalNumberOfOrdersForEachProduct = totalNumberOfOrdersForEachProduct.filter(['PRODUCTLINE']).groupby('PRODUCTLINE').size()
totalNumberOfOrdersForEachProduct = pd.DataFrame({
"TOTALORDERES":pd.Series(totalNumberOfOrdersForEachProduct)
})
totalNumberOfOrdersForEachProduct
| TOTALORDERES | |
|---|---|
| PRODUCTLINE | |
| Classic Cars | 967 |
| Motorcycles | 331 |
| Planes | 306 |
| Ships | 234 |
| Trains | 77 |
| Trucks and Buses | 301 |
| Vintage Cars | 607 |
allDataForProducts = revenueForEachProduct
allDataForProducts = pd.DataFrame({
"TOTALREVENUE":pd.Series(revenueForEachProduct['REVENUE']),
"TOTALUNITSSOLD":pd.Series(totalUnitsSoldForEachProduct['QUANTITYORDERED']),
"AVERAGEPRICEPERUNIT":pd.Series(averageForEachProduct['AVERAGE']),
"TOTALNUMEROFORDERS":pd.Series(totalNumberOfOrdersForEachProduct['TOTALORDERES']),
})
allDataForProducts
| TOTALREVENUE | TOTALUNITSSOLD | AVERAGEPRICEPERUNIT | TOTALNUMEROFORDERS | |
|---|---|---|---|---|
| PRODUCTLINE | ||||
| Classic Cars | -159572.67 | 33992 | 120.152301 | 967 |
| Motorcycles | 26874.83 | 11663 | 97.673669 | 331 |
| Planes | 30849.57 | 10727 | 88.755011 | 306 |
| Ships | 4705.76 | 8127 | 86.047004 | 234 |
| Trains | 27308.57 | 2712 | 72.656342 | 77 |
| Trucks and Buses | 17184.39 | 10777 | 103.033126 | 301 |
| Vintage Cars | 88671.63 | 21069 | 86.152451 | 607 |
rank = allDataForProducts.rank()
rank
| TOTALREVENUE | TOTALUNITSSOLD | AVERAGEPRICEPERUNIT | TOTALNUMEROFORDERS | |
|---|---|---|---|---|
| PRODUCTLINE | ||||
| Classic Cars | 1.0 | 7.0 | 7.0 | 7.0 |
| Motorcycles | 4.0 | 5.0 | 5.0 | 5.0 |
| Planes | 6.0 | 3.0 | 4.0 | 4.0 |
| Ships | 2.0 | 2.0 | 2.0 | 2.0 |
| Trains | 5.0 | 1.0 | 1.0 | 1.0 |
| Trucks and Buses | 3.0 | 4.0 | 6.0 | 3.0 |
| Vintage Cars | 7.0 | 6.0 | 3.0 | 6.0 |
ax = allDataForProducts.plot.line(figsize = (10,4))
soldForEachMonth = df.query("STATUS=='Shipped'")
soldForEachMonth = soldForEachMonth.filter(['MONTH','SALES'])
soldForEachMonth = soldForEachMonth.groupby('MONTH').sum()
soldForEachMonth = soldForEachMonth['SALES']
soldForEachMonth
MONTH 1 741601.08 2 810441.90 3 700685.67 4 538976.00 5 639368.06 6 354337.88 7 514875.97 8 659310.57 9 584724.27 10 1043953.71 11 2068546.85 12 634679.12 Name: SALES, dtype: float64
msrpForEachMonth = df.query("STATUS=='Shipped'")
msrpForEachMonth = msrpForEachMonth.filter(['MONTH','MSRP','QUANTITYORDERED'])
msrpTotal = msrpForEachMonth['MSRP'] * msrpForEachMonth ['QUANTITYORDERED']
msrpForEachMonth['MSRP'] = msrpTotal
msrpForEachMonth = msrpForEachMonth.filter(['MONTH','MSRP']).groupby('MONTH').sum()
msrpForEachMonth
| MSRP | |
|---|---|
| MONTH | |
| 1 | 752510 |
| 2 | 800896 |
| 3 | 691492 |
| 4 | 543105 |
| 5 | 636337 |
| 6 | 356616 |
| 7 | 510282 |
| 8 | 665198 |
| 9 | 575531 |
| 10 | 1023170 |
| 11 | 2067668 |
| 12 | 632674 |
totalRevenueForEachMonth = soldForEachMonth - msrpForEachMonth['MSRP']
totalRevenueForEachMonth = pd.DataFrame({
"REVENUE":pd.Series(totalRevenueForEachMonth)
})
totalRevenueForEachMonth
| REVENUE | |
|---|---|
| MONTH | |
| 1 | -10908.92 |
| 2 | 9545.90 |
| 3 | 9193.67 |
| 4 | -4129.00 |
| 5 | 3031.06 |
| 6 | -2278.12 |
| 7 | 4593.97 |
| 8 | -5887.43 |
| 9 | 9193.27 |
| 10 | 20783.71 |
| 11 | 878.85 |
| 12 | 2005.12 |
totalPricesForEachMonth = df
totalPricesForEachMonth = totalPricesForEachMonth.filter(['MONTH','MSRP','QUANTITYORDERED'])
priceTotal = totalPricesForEachMonth['MSRP'] * totalPricesForEachMonth ['QUANTITYORDERED']
totalPricesForEachMonth['TOTALPRICE'] = priceTotal
totalPricesForEachMonth = totalPricesForEachMonth.filter(['MONTH','TOTALPRICE']).groupby('MONTH').sum()
totalPricesForEachMonth
| TOTALPRICE | |
|---|---|
| MONTH | |
| 1 | 796703 |
| 2 | 800896 |
| 3 | 744580 |
| 4 | 673007 |
| 5 | 919360 |
| 6 | 459201 |
| 7 | 510282 |
| 8 | 665198 |
| 9 | 575531 |
| 10 | 1103077 |
| 11 | 2116833 |
| 12 | 632674 |
totalUnitsSoldForEachMonth = df
totalUnitsSoldForEachMonth = totalUnitsSoldForEachMonth.filter(['MONTH','QUANTITYORDERED'])
totalUnitsSoldForEachMonth = totalUnitsSoldForEachMonth.groupby('MONTH').sum()
totalUnitsSoldForEachMonth
| QUANTITYORDERED | |
|---|---|
| MONTH | |
| 1 | 7997 |
| 2 | 7903 |
| 3 | 7585 |
| 4 | 6704 |
| 5 | 8992 |
| 6 | 4620 |
| 7 | 4899 |
| 8 | 6538 |
| 9 | 5681 |
| 10 | 10998 |
| 11 | 20857 |
| 12 | 6293 |
averageForEachMonth = totalUnitsSoldForEachMonth
averageForEachMonth['TOTALPRICE'] = totalPricesForEachMonth['TOTALPRICE']
averageForEachMonth['AVERAGEPRICEPERUNIT'] = averageForEachMonth['TOTALPRICE']/averageForEachMonth['QUANTITYORDERED']
averageForEachMonth = averageForEachMonth.filter(['MONTH','AVERAGEPRICEPERUNIT']).groupby('MONTH').head()
averageForEachMonth
| AVERAGEPRICEPERUNIT | |
|---|---|
| MONTH | |
| 1 | 99.625234 |
| 2 | 101.340757 |
| 3 | 98.164799 |
| 4 | 100.388872 |
| 5 | 102.241993 |
| 6 | 99.394156 |
| 7 | 104.160441 |
| 8 | 101.743347 |
| 9 | 101.308044 |
| 10 | 100.297963 |
| 11 | 101.492688 |
| 12 | 100.536151 |
allDataForMonths = revenueForEachProduct
allDataForMonths = pd.DataFrame({
"TOTALREVENUE":pd.Series(totalRevenueForEachMonth['REVENUE']),
"TOTALUNITSSOLD":pd.Series(totalUnitsSoldForEachMonth['QUANTITYORDERED']),
"AVERAGEPRICEPERUNIT":pd.Series(averageForEachMonth['AVERAGEPRICEPERUNIT']),
})
allDataForMonths
| TOTALREVENUE | TOTALUNITSSOLD | AVERAGEPRICEPERUNIT | |
|---|---|---|---|
| MONTH | |||
| 1 | -10908.92 | 7997 | 99.625234 |
| 2 | 9545.90 | 7903 | 101.340757 |
| 3 | 9193.67 | 7585 | 98.164799 |
| 4 | -4129.00 | 6704 | 100.388872 |
| 5 | 3031.06 | 8992 | 102.241993 |
| 6 | -2278.12 | 4620 | 99.394156 |
| 7 | 4593.97 | 4899 | 104.160441 |
| 8 | -5887.43 | 6538 | 101.743347 |
| 9 | 9193.27 | 5681 | 101.308044 |
| 10 | 20783.71 | 10998 | 100.297963 |
| 11 | 878.85 | 20857 | 101.492688 |
| 12 | 2005.12 | 6293 | 100.536151 |
ax = allDataForMonths.plot.line(figsize = (10,4))
soldForEachMonth = df.query("STATUS=='Shipped'")
soldForEachMonth = soldForEachMonth.filter(['CITY','SALES'])
soldForEachMonth = soldForEachMonth.groupby('CITY').size().sort_values(ascending=False)
soldForEachMonth.head()
CITY Madrid 258 San Rafael 178 NYC 138 Singapore 79 Paris 70 dtype: int64